# PoP - Policing Socio-Geographic Boundaries and Inequality
# script for creating final dataset of police stops for Austin.

# load packages
suppressPackageStartupMessages(
  
  {
    library(dplyr)
    library(tidyverse)
    library(ggplot2)
    library(haven)
    library(readxl)
    library(readr)
    library(areal)
    library(car)
    library(estimatr)
    library(magrittr)
    library(texreg)
    library(sandwich)
    library(jtools)
  }
)

# load data by year
# 2014
rpd_2014 <- read_csv("2014_Racial_Profiling_Arrests.csv")
# doesnt have lat, lon coordinates, cant use

# 2015
rpd_2015 <- read_csv("2015_Racial_Profiling_Arrests (1).csv")
# has coords, 15 vars

# 2016 
rpd_2016 <- read_csv("2016_RP_Arrests.csv")
# has coords, 15 vars

# 2017
rpd_2017 <- read_csv("2017_Racial_Profiling_Arrests.csv")
# has coords, 15 vars

# 2018
rpd_2018 <- read_csv("2018_RP_Arrests.csv")
# has coords, 18 vars
colnames(rpd_2017)
colnames(rpd_2018)
# additional vars = council district, county_description, ZIP

# 2019
rpd_2019 <- read_csv("2019_Racial_Profiling__RP__Arrests.csv")
# has coords, 18 vars
colnames(rpd_2019)
# additional vars = council district, county_description, ZIP

# 2020
rpd_2020 <- read_csv("2020_Racial_Profiling__RP__dataset.csv")
# has coords, 24 vars
colnames(rpd_2020)
# additional vars = council district, county_description, ZIP, street type, custody, standardized race, 
# type, tcole race ethnicity

# select key vars
# 2015
rpd_15_sm = rpd_2015 %>% dplyr::select(c(PRIMARY_KEY, REP_DATE, APD_RACE_DESC, PERSON_SEARCHED_DESC,
                                         REASON_FOR_STOP_DESC, SEARCH_DISC_DESC, X_COORDINATE, Y_COORDINATE))
# 2016
rpd_16_sm = rpd_2016 %>% dplyr::select(c(PRIMARY_KEY, REP_DATE, APD_RACE_DESC, PERSON_SEARCHED_DESC,
                                         REASON_FOR_STOP_DESC, SEARCH_DISC_DESC, X_COORDINATE, Y_COORDINATE))
# 2017
rpd_17_sm = rpd_2017 %>% dplyr::select(c(PRIMARY_KEY, REP_DATE, APD_RACE_DESC, PERSON_SEARCHED_DESC,
                                         REASON_FOR_STOP_DESC, CORRECTED_search_discovered, X_COORDINATE, Y_COORDINATE))

# 2018
rpd_18_sm = rpd_2018 %>% dplyr::select(c(PRIMARY_KEY, REP_DATE, APD_RACE_DESC, `Person Search YN`, 
                                         `Reason for Stop – TCOLE form`, `Search Found`, X_COORDINATE, Y_COORDINATE))
# 2019
rpd_19_sm = rpd_2019 %>% dplyr::select(c(PRIMARY_KEY, REP_DATE, APD_RACE_DESC, `Person Search YN`, 
                                         `Reason for Stop – TCOLE form`, `Search Found`, X_COORDINATE, Y_COORDINATE))

# 2020
rpd_20_sm = rpd_2020 %>% dplyr::select(c(`Stop Key`, `Stop Date`, `Standardized Race`,
                                         `Search Yes or No`, `Reason for Stop`, `TCOLE Search Found`, 
                                         X_COORDINATE, Y_COORDINATE))

# now we need to standardize var names to bind
# standard list = PRIMARY_KEY, REP_DATE, APD_RACE_DESC, PERSON_SEARCH, REASON_FOR_STOP, 
# SEARCH_FOUND, X_COORDINATE, Y_COORDINATE

rpd_15_sm = rpd_15_sm %>% rename(PERSON_SEARCH = PERSON_SEARCHED_DESC, REASON_FOR_STOP = REASON_FOR_STOP_DESC, 
                                 SEARCH_FOUND = SEARCH_DISC_DESC)

rpd_16_sm = rpd_16_sm %>% rename(PERSON_SEARCH = PERSON_SEARCHED_DESC, REASON_FOR_STOP = REASON_FOR_STOP_DESC, 
                                 SEARCH_FOUND = SEARCH_DISC_DESC)


rpd_17_sm = rpd_17_sm %>% rename(PERSON_SEARCH = PERSON_SEARCHED_DESC, REASON_FOR_STOP = REASON_FOR_STOP_DESC, 
                                 SEARCH_FOUND = CORRECTED_search_discovered)

rpd_18_sm = rpd_18_sm %>% rename(PERSON_SEARCH = `Person Search YN`, REASON_FOR_STOP = `Reason for Stop – TCOLE form`, 
                                 SEARCH_FOUND = `Search Found`)

rpd_19_sm = rpd_19_sm %>% rename(PERSON_SEARCH = `Person Search YN`, REASON_FOR_STOP = `Reason for Stop – TCOLE form`, 
                                 SEARCH_FOUND = `Search Found`)

rpd_20_sm = rpd_20_sm %>% rename(PRIMARY_KEY = `Stop Key`, REP_DATE = `Stop Date`, APD_RACE_DESC = `Standardized Race`,
                                 PERSON_SEARCH = `Search Yes or No`, REASON_FOR_STOP = `Reason for Stop`, 
                                 SEARCH_FOUND = `TCOLE Search Found`)

# bind all years together
rpd_aus <- rbind(rpd_15_sm, rpd_16_sm,rpd_17_sm,rpd_18_sm,rpd_19_sm,rpd_20_sm)
# 118122 obs of 8 vars

# create stop_year variable

# 2015
class(rpd_15_sm$REP_DATE)
rpd_15_sm$stop_year = substring(rpd_15_sm$REP_DATE, 7,10)
table(rpd_15_sm$stop_year)
# 2016
class(rpd_16_sm$REP_DATE)
rpd_16_sm$stop_year = substring(rpd_16_sm$REP_DATE, 7,10)
table(rpd_16_sm$stop_year)
# 2017
class(rpd_17_sm$REP_DATE)
rpd_17_sm$stop_year = substring(rpd_17_sm$REP_DATE, 7,10)
table(rpd_17_sm$stop_year)
# 2018
class(rpd_18_sm$REP_DATE)
rpd_18_sm$stop_year = substring(rpd_18_sm$REP_DATE, 7,10)
table(rpd_18_sm$stop_year)

# 2019
class(rpd_19_sm$REP_DATE)
rpd_19_sm$stop_year = substring(rpd_19_sm$REP_DATE, 7,10)
table(rpd_19_sm$stop_year)

rpd_19_sm = rpd_19_sm %>%
  mutate(stop_year = case_when((stop_year == "2019") ~ "2019",
                               (stop_year == "19") ~ "2019",
                               (stop_year == "019") ~ "2019",
                               TRUE ~ NA))
table(rpd_19_sm$stop_year)

# 2020
class(rpd_20_sm$REP_DATE)
rpd_20_sm$stop_year = substring(rpd_20_sm$REP_DATE, 1,4)
table(rpd_20_sm$stop_year)


# save
save(x = rpd_aus, file = "rpd_aus.RData")


load("rpd_aus.RData")

# make binary numeric vars for key cols
table(rpd_aus$APD_RACE_DESC)

# race of arrestee
rpd_aus = rpd_aus %>%
  mutate(r_black = ifelse(APD_RACE_DESC == "BLACK", 1, 0)) %>%
  mutate(r_white = ifelse(APD_RACE_DESC == "WHITE", 1, 0)) %>%
  mutate(r_hisp = ifelse(APD_RACE_DESC == "HISPANIC OR LATINO", 1, 0)) %>%
  mutate(r_asian = ifelse(APD_RACE_DESC == "ASIAN" | APD_RACE_DESC == "MIDDLE EASTERN", 1, 0)) %>%
  mutate(r_nonwhite = ifelse(APD_RACE_DESC == "BLACK" | APD_RACE_DESC == "HISPANIC OR LATINO" | APD_RACE_DESC == "ASIAN" | 
                               APD_RACE_DESC == "MIDDLE EASTERN" | APD_RACE_DESC == "AMERICAN INDIAN/ALASKAN NATIVE" | 
                               APD_RACE_DESC == "HAWAIIAN/PACIFIC ISLANDER", 1, 0))

# if person was searched
rpd_aus = rpd_aus %>%
  mutate(searched = ifelse(PERSON_SEARCH == "Yes = 1" | PERSON_SEARCH == "YES", 1, 0)) %>%
  mutate(nothing_found = ifelse(SEARCH_FOUND == "NOTHING", 1, 0))

# code reason for stop
table(rpd_aus$REASON_FOR_STOP)

rpd_aus = rpd_aus %>%
  mutate(traffic = case_when((REASON_FOR_STOP == "MOTOR VEHICLE DRIVER") ~ 1,
                             (REASON_FOR_STOP == "Moving Traffic Violation") ~ 1,
                             (REASON_FOR_STOP == "Vehicle Traffic Violation") ~ 1,
                             (REASON_FOR_STOP == "VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS") ~ 1,
                             TRUE ~ 0))

rpd_aus = rpd_aus %>%
  mutate(service_call = ifelse(REASON_FOR_STOP == "CALL FOR SERVICE", 1, 0))

rpd_aus = rpd_aus %>%
  mutate(warrant = ifelse(REASON_FOR_STOP == "Pre-existing knowledge (i.e. warrant)", 1, 0))

rpd_aus = rpd_aus %>%
  mutate(ped_stop = case_when((REASON_FOR_STOP == "PRE-EXISTING KNOWLEDGE") ~ 1,
                              (REASON_FOR_STOP == "SUSPICIOUS PERSON / VEHICLE") ~ 1,
                              (REASON_FOR_STOP == "VIOLATION OF CITY ORDIANCE") ~ 1,
                              (REASON_FOR_STOP == "VIOLATION OF PENAL CODE") ~ 1,
                              (REASON_FOR_STOP == "WATER SAFETY ACT") ~ 1,
                              (REASON_FOR_STOP == "Violation of law other than traffic") ~ 1,
                              TRUE ~ 0))
rpd_aus = rpd_aus %>%
  mutate(stop_other = case_when((REASON_FOR_STOP == "CONSENSUAL CONTACT") ~ 1,
                                (REASON_FOR_STOP == "OTHER") ~ 1,
                                TRUE ~ 0))


# save
save(x = rpd_aus, file = "rpd_aus.RData")


load("rpd_aus.RData")
# Turning data into geometric object
sum(is.na(rpd_aus$X_COORDINATE))
# 21938 NAs for location coords

# remove NAs
rpd_aus_geo = rpd_aus %>% filter(!is.na(X_COORDINATE))
rpd_aus_geo = rpd_aus_geo %>% filter(!is.na(Y_COORDINATE))

class(rpd_aus_geo$X_COORDINATE)

# convert from character to numeric
rpd_aus_geo$X_COORDINATE <- as.numeric(rpd_aus_geo$X_COORDINATE)
rpd_aus_geo$Y_COORDINATE <- as.numeric(rpd_aus_geo$Y_COORDINATE)

# remove newly introduced NAs
rpd_aus_geo = rpd_aus_geo %>% filter(!is.na(X_COORDINATE))
rpd_aus_geo = rpd_aus_geo %>% filter(!is.na(Y_COORDINATE))


# set X and Y coords
library(sf)
rpd_aus_geo = rpd_aus_geo %>% st_as_sf(coords = c("X_COORDINATE", "Y_COORDINATE"))



# load in final aus dataset
load("aus_final.RData")
# check crs for final df
st_crs(aus_fin)
# EPSG:4269 on NAD83


# coords are on EPSG:2277 on State Plane Texas Central (NAD83); so set initial coords to that system  
st_crs(rpd_aus_geo)
st_crs(rpd_aus_geo) <- 2277

# then transform to EPSG:4269 (on NAD83) to match aus_fin
rpd_aus_geo$geometry = st_transform(rpd_aus_geo$geometry, "EPSG:4269")

# rpd_aus_geo$geometry = st_transform(rpd_aus_geo$geometry, "EPSG:4326")


# set CRS between ped stops data and aus_fin
st_crs(rpd_aus_geo) = st_crs(aus_fin)
rpd_aus_geo = st_transform(rpd_aus_geo, st_crs(aus_fin))

aus_stops = rpd_aus_geo


# find intersection between stop data and austin blk groups 
aus_ints = st_intersects(aus_stops, aus_fin)

# now filter by race of arrestee
aus_ints2a = st_intersects(aus_stops %>% filter(r_black == 1), aus_fin)
aus_ints2b = st_intersects(aus_stops %>% filter(r_hisp == 1), aus_fin)
aus_ints2c = st_intersects(aus_stops %>% filter(r_white == 1), aus_fin)
aus_ints2d = st_intersects(aus_stops %>% filter(r_asian == 1), aus_fin)
aus_ints2e = st_intersects(aus_stops %>% filter(r_nonwhite == 1), aus_fin)

# sum total stops by blk
theout = aus_fin[aus_ints %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>% 
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(all_stops_total = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

# sum total stops by race
theout2a = aus_fin[aus_ints2a %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(all_stops_black = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout2b = aus_fin[aus_ints2b %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(all_stops_latino = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout2c = aus_fin[aus_ints2c %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(all_stops_white = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout2d = aus_fin[aus_ints2d %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(all_stops_asian = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout2e = aus_fin[aus_ints2e %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(all_stops_nonwhite = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

# remerge with aus_fin

aus_fin2 = merge(aus_fin, theout, by = "BLK_CODE", all.x = TRUE)
aus_fin2 = merge(aus_fin2, theout2a, by = "BLK_CODE", all.x = TRUE)
aus_fin2 = merge(aus_fin2, theout2b, by = "BLK_CODE", all.x = TRUE)
aus_fin2 = merge(aus_fin2, theout2c, by = "BLK_CODE", all.x = TRUE)
aus_fin2 = merge(aus_fin2, theout2d, by = "BLK_CODE", all.x = TRUE)
aus_fin2 = merge(aus_fin2, theout2e, by = "BLK_CODE", all.x = TRUE)


sum(is.na(aus_fin2$all_stops_total))

sum(aus_fin2$all_stops_total, na.rm = TRUE)

# now filter by just pedestrian stops
# find intersection between stop data and austin blk groups 
aus_ints2 = st_intersects(aus_stops %>% filter(ped_stop == 1), aus_fin)

# now filter by race of arrestee
aus_ints3a = st_intersects(aus_stops %>% filter(ped_stop == 1 & r_black == 1), aus_fin)
aus_ints3b = st_intersects(aus_stops %>% filter(ped_stop == 1 & r_hisp == 1), aus_fin)
aus_ints3c = st_intersects(aus_stops %>% filter(ped_stop == 1 & r_white == 1), aus_fin)
aus_ints3d = st_intersects(aus_stops %>% filter(ped_stop == 1 & r_asian == 1), aus_fin)
aus_ints3e = st_intersects(aus_stops %>% filter(ped_stop == 1 & r_nonwhite == 1), aus_fin)

# sum total stops by blk
theout2 = aus_fin[aus_ints2 %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>% 
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(ped_stops_total = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

# sum total stops by race
theout3a = aus_fin[aus_ints3a %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(ped_stops_black = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout3b = aus_fin[aus_ints3b %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(ped_stops_latino = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout3c = aus_fin[aus_ints3c %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(ped_stops_white = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout3d = aus_fin[aus_ints3d %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(ped_stops_asian = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

theout3e = aus_fin[aus_ints3e %>% unlist, ] %>% 
  dplyr::select(BLK_CODE) %>%
  mutate(count = 1) %>% 
  group_by(BLK_CODE) %>% 
  summarize(ped_stops_nonwhite = sum(count, na.rm = TRUE)) %>% 
  as.data.frame %>% 
  mutate(geometry = NULL)

# remerge with aus_fin

aus_fin3 = merge(aus_fin2, theout2, by = "BLK_CODE", all.x = TRUE)
aus_fin3 = merge(aus_fin3, theout3a, by = "BLK_CODE", all.x = TRUE)
aus_fin3 = merge(aus_fin3, theout3b, by = "BLK_CODE", all.x = TRUE)
aus_fin3 = merge(aus_fin3, theout3c, by = "BLK_CODE", all.x = TRUE)
aus_fin3 = merge(aus_fin3, theout3d, by = "BLK_CODE", all.x = TRUE)
aus_fin3 = merge(aus_fin3, theout3e, by = "BLK_CODE", all.x = TRUE)

# convert NAs to 0s
# all stops
aus_fin3$all_stops_total <- ifelse(is.na(aus_fin3$all_stops_total), 0, aus_fin3$all_stops_total)
aus_fin3$all_stops_black <- ifelse(is.na(aus_fin3$all_stops_black), 0, aus_fin3$all_stops_black)
aus_fin3$all_stops_white <- ifelse(is.na(aus_fin3$all_stops_white), 0, aus_fin3$all_stops_white)
aus_fin3$all_stops_latino <- ifelse(is.na(aus_fin3$all_stops_latino), 0, aus_fin3$all_stops_latino)
aus_fin3$all_stops_asian <- ifelse(is.na(aus_fin3$all_stops_asian), 0, aus_fin3$all_stops_asian)
aus_fin3$all_stops_nonwhite <- ifelse(is.na(aus_fin3$all_stops_nonwhite), 0, aus_fin3$all_stops_nonwhite)


# ped stops
aus_fin3$ped_stops_total <- ifelse(is.na(aus_fin3$ped_stops_total), 0, aus_fin3$ped_stops_total)
aus_fin3$ped_stops_black <- ifelse(is.na(aus_fin3$ped_stops_black), 0, aus_fin3$ped_stops_black)
aus_fin3$ped_stops_white <- ifelse(is.na(aus_fin3$ped_stops_white), 0, aus_fin3$ped_stops_white)
aus_fin3$ped_stops_latino <- ifelse(is.na(aus_fin3$ped_stops_latino), 0, aus_fin3$ped_stops_latino)
aus_fin3$ped_stops_asian <- ifelse(is.na(aus_fin3$ped_stops_asian), 0, aus_fin3$ped_stops_asian)
aus_fin3$ped_stops_nonwhite <- ifelse(is.na(aus_fin3$ped_stops_nonwhite), 0, aus_fin3$ped_stops_nonwhite)

aus_stops_final = aus_fin3

# save
save(x = aus_stops_final, file = "aus_stops_final.RData")

